# # Maximum Number of Threads
# import os
# os.environ['NUMEXPR_MAX_THREADS'] = '24'
# Text
from colorama import Fore, Back, Style
# Computations
import numpy as np
import pandas as pd
# statsmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm
# pymc3
from pymc3 import Model, sample, Normal, HalfCauchy, Uniform
from pymc3 import Deterministic, model_to_graphviz, forestplot, plot_posterior
# preprocessing
from sklearn import preprocessing
# VSIualisation libraries
## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## missingno
import missingno as msno
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
## arviz
import arviz as az
import warnings
warnings.filterwarnings("ignore")
The files consist of a list of product listings. These files are tab-delimited.
| Columns | Description |
|---|---|
| train_id or test_id | the id of the listing |
| name | the title of the listing. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage. These removed prices are represented as (rm) |
| item_condition_id | the condition of the items provided by the seller |
| category_name | category of the listing |
| brand_name | |
| price | the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict. |
| shipping | 1 if shipping fee is paid by seller and 0 by buyer |
| item_description | the full description of the item. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage. These removed prices are represented as (rm) |
We will model price for all categories. Our estimate of the parameter of product price can be considered a prediction. This article is inspired by A Primer on Bayesian Methods for Multilevel Modeling.
Data = pd.read_csv('mercari-price-suggestion-challenge/train.tsv', sep = '\t')
Data.columns = [x.title().replace('Id','ID') for x in Data.columns]
Data.rename(columns = {'Shipping':'Shipping Fee'}, inplace = True)
def Data_info(Inp, Only_NaN = False):
Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
Out['Size'] = Inp.shape[0]
Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
if Only_NaN:
Out = Out.loc[Out['Number of NaN Values']>0]
return Out
display(Data_info(Data))
_ = msno.bar(Data, figsize=(6,4), fontsize=14, log=False, color="#34495e")
Temp = Data.groupby(['Category_Name', 'Shipping Fee'])['Category_Name'].agg({'count'}).\
rename(columns = {'count':'Count'}).reset_index(drop = False)
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')
Temp.reset_index(drop = False, inplace = True)
Top = 20
Temp0 = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(Temp0)]
del Temp0
C = ['greenyellow', 'seagreen']; SC = 'DarkGreen'
fig = px.bar(Temp, y= 'Category_Name', x= 'Count', orientation='h',
color = 'Shipping Fee', text = 'Count', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 7e4])
fig.update_layout(xaxis = dict(tickmode = 'array', tickvals = [int(x*10e3) for x in np.arange(0,8)]))
fig.update_layout(title = 'Top %i Categories' % Top, plot_bgcolor= 'white', yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Temp = Data.groupby(['Category_Name', 'Shipping Fee'])['Price'].agg({'mean'}).\
rename(columns = {'mean':'Average Price'}).reset_index(drop = False)
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')
Temp.reset_index(drop = False, inplace = True)
Top = 20
Temp0 = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(Temp0)]
Temp = Temp.round(2)
del Temp0
C = ['MistyRose', 'FireBrick']; SC = 'DarkRed'
fig = px.bar(Temp, y= 'Category_Name', x= 'Average Price', orientation='h',
color = 'Shipping Fee', text = 'Average Price', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 150])
fig.update_layout(title = 'Top %i Categories' % Top, plot_bgcolor= 'white', yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Temp = Data.groupby(['Category_Name', 'Shipping Fee'])['Price'].agg({'sum'}).\
rename(columns = {'sum':'Total'}).reset_index(drop = False)
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')
Temp.reset_index(drop = False, inplace = True)
Top = 20
Temp0 = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(Temp0)]
Temp = Temp.round(2)
del Temp0
C = ['Azure', 'RoyalBlue']; SC = 'DarkBlue'
fig = px.bar(Temp, y= 'Category_Name', x= 'Total', orientation='h',
color = 'Shipping Fee', text = 'Total', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 2.5e6])
fig.update_layout(title = 'Top %i Categories' % Top, plot_bgcolor= 'white', yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Data.dropna(subset = ['Category_Name'], inplace = True)
# a random sample of the dataset
Data = Data.sample(frac=0.01, random_state=99)
display(Data_info(Data))
_ = msno.bar(Data, figsize=(6,4), fontsize=14, log=False, color="#34495e")
Temp = Data[['Price', 'Shipping Fee']]
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 0 else 'Buyer')
C = ['aquamarine', 'steelblue']; SC = 'Navy'
fig = px.histogram(Temp, x = 'Price', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
color_discrete_sequence = C,
title = 'Probability Density (Price)')
fig['layout']['xaxis'].update(range=[0, 100])
fig['layout']['yaxis'].update(range=[0, .1])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()
le = preprocessing.LabelEncoder()
Data['Category'] = le.fit_transform(Data['Category_Name'])
Data['Price_Log'] = Data['Price'].apply(lambda x: np.log(x+0.1))
Category_df = Data.loc[Data.index.isin(Data['Category'].drop_duplicates().index),
['Category_Name','Category']].sort_values(by=['Category']).reset_index(drop = True)
Temp = Data[['Price_Log', 'Shipping Fee']]
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 0 else 'Buyer')
C = ['aquamarine', 'steelblue']; SC = 'Navy'
fig = px.histogram(Temp, x = 'Price_Log', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
color_discrete_sequence = C,
title = 'Probability Density (Price (Logarithm))')
fig['layout']['xaxis'].update(range=[-4, 8])
fig['layout']['yaxis'].update(range=[0, 3])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()
In this section, we would like to do a panel data analysis in which data are often collected over time and the same individuals. Then a regression is run over these two dimensions [1].
A common panel data regression model: $$y_{it}= \alpha + \beta x_{it}+\varepsilon _{it},$$ where
For more details, please see [1, 2].
Useful functions throughout the article:
def df_search(df, key): return [s for s in df.columns if key in s]
def list_search(List, key): return [s for s in List if key in s]
LaTex = {'alpha': r'$\alpha$', 'beta': r'$\beta$', 'sigma': r'$\sigma$',
'sigma_alpha': r'$\sigma_{\alpha}$', 'sigma_beta': r'$\sigma_{\beta}$', 'sigma_y': r'$\sigma_{y}$',
'mu_alpha': r'$\mu_{\alpha}$', 'mu_beta': r'$\mu_{\beta}$'}
In complete pooling, we combine all the information from all the categories into a single pool of data. Thus, treat all cathegoris the same, and estimate a single price level. $$y_{i}= \alpha + \beta x_{i}+\varepsilon _{i},$$
However, a problem with this approach is the level might be different for different categories.
The model can be defind with the following considersations: \begin{align} \begin{cases} \alpha~\sim~N\left(0,\sigma^2\right),\\ \beta~\sim~N\left(0,\sigma^2\right),\\ \end{cases} \end{align}
where $\sigma$ has a half-Cauchy distribution. A half-Cauchy is one of the symmetric halves of the Cauchy distribution (if it is unspecified, the right half that's intended)
Cauchy Probability Density Function: $$\text{Cauchy}(y|\mu,\sigma) = \frac{1}{\pi \sigma} \ \frac{1}{1 + \left((y - \mu)/\sigma\right)^2}.$$
Moreover, let
| Parameter | Description |
|---|---|
| x | Shipping Fee |
| y | Price (Log) |
To implement this, we use PyMC3.
x = Data['Shipping Fee'].values
with Model() as pooled_model:
alpha = Normal('alpha', mu = 0, sigma = 1e5, shape=1)
beta = Normal('beta', mu = 0, sigma = 1e5, shape=1)
sigma = HalfCauchy('sigma', 5)
theta = alpha + beta*x
y = Normal('y', theta, sigma=sigma, observed = Data['Price_Log'].values)
del x
display(model_to_graphviz(pooled_model))
# Sampling
with pooled_model:
pooled_trace = sample(1000, tune=1000)
# Complete Pool Parameters
CP_Parm = pd.DataFrame(pooled_trace['alpha'], columns = ['alpha'])
CP_Parm['beta'] = pooled_trace['beta']
# Figures
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
_ = ax[0].scatter(x= CP_Parm.index, y= CP_Parm.alpha, color = 'SkyBlue', edgecolors = 'Navy', label = LaTex['alpha'])
_ = ax[0].hlines(CP_Parm.alpha.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = LaTex['mu_alpha'])
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[0].set_ylim([3.06, 3.16])
_ = ax[1].scatter(x= CP_Parm.index, y= CP_Parm.beta, color = 'LimeGreen', edgecolors = 'DarkGreen', label= LaTex['beta'])
_ = ax[1].hlines(CP_Parm.beta.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
color = 'Navy', lw = 3, label = LaTex['mu_beta'])
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[1].set_ylim([-.48, -.36])
Vars = ['alpha', 'beta', 'sigma']
ax = az.plot_trace(pooled_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'alpha = %.4f, beta = %.4f' % (CP_Parm.mean().values[0],
CP_Parm.mean().values[1]))
t = np.linspace(-0.2, 1.2)
CP_fun = lambda x: CP_Parm.alpha.mean() + CP_Parm.beta.mean() *t
fig = go.Figure()
fig.add_trace(go.Scatter(x= Data['Shipping Fee'].values, y = Data['Price_Log'].values, mode='markers',
name='Shipping Fee - Price (Log)'))
fig.add_trace(go.Scatter(x= t, y= CP_fun(t), mode='lines', name='Fitted Model'))
fig.update_layout(title = 'Fitted Model', plot_bgcolor= 'WhiteSmoke',
xaxis_title= 'Shipping Fee', yaxis_title= 'Price (Log)', height = 500, width = 650)
fig['layout']['yaxis'].update(range=[-4, 8])
fig.update_layout(xaxis = dict(tickmode = 'array', tickvals = [0,1], ticktext = ['Buyer', 'Seller']))
fig.show()
del t
Note that this is similar to a Linear Regression model (Check this article for more details regarding Linear Regression model).
Temp = Data[['Price_Log', 'Shipping Fee']]
Temp.columns = ['Price_Log', 'Shipping_Fee']
Results = smf.ols('Price_Log ~ Shipping_Fee', Temp).fit()
Results.summary().tables[1]
Here, Intercept is the same as $\alpha$ and the coefficient of Shipping Fee is the same as $\beta$.
We assume that there is no connection at all between the Price (Log) levels in the different categories. In other words, we model price in each category independently. That is
$$y_{i}= \alpha_{j[i]} + \beta x_{i}+\epsilon_i,$$The model can be defind with the following considersations: \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]}, \beta \sim N(\mu, \sigma^2) \end{align}
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as unpooled_model:
alpha = Normal('alpha', mu = 0, sigma = 1e5, shape= Categories)
beta = Normal('beta', mu = 0, sigma = 1e5)
sigma = HalfCauchy('sigma', 5)
theta = alpha[Category] + beta*x
y = Normal('y', theta, sigma=sigma, observed= Data['Price_Log'].values)
display(model_to_graphviz(unpooled_model))
# Sampling
with unpooled_model:
unpooled_trace = sample(1000, tune=1000)
# Un-pooled (No Pool) Parmameters
NP_Parm = pd.DataFrame(unpooled_trace['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
NP_Parm['beta'] = unpooled_trace['beta']
fig, ax = plt.subplots(1, 1, figsize=(16, 6.5))
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
_ = ax.scatter(x= Temp.index, y= Temp['beta'], color = 'SkyBlue', edgecolors = 'Navy')
_ = ax.hlines(Temp.mean(), -1, len(Temp.index)+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = LaTex['mu_alpha'])
_ = ax.set_title(r'$\beta$', fontsize = 16)
_ = ax.grid(True)
_ = ax.set_xlim([-1, len(NP_Parm.index)+1])
_ = ax.set_ylim([-.36, -.24])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize = 14)
del Temp
Vars = ['alpha', 'beta', 'sigma']
Vars = sorted(Vars)
ax = az.plot_trace(unpooled_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if len(Vars)%2 ==1:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(unpooled_trace, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if (len(Vars)< len(ax)):
for i in np.arange(len(Vars), len(ax)):
ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
A plot of the ordered Estimates:
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
_ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-1, 8])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-1, 8])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = NP_Parm.copy()
beta = Temp['beta'].mean()
Temp = Temp[df_search(Temp, 'alpha')]
for alpha in Temp.mean(axis=0):
ax.plot(t, alpha + beta*t, color='DarkBlue', marker='o', linestyle='--',
linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha, beta, delta
A Visual comparisons between the pooled and unpooled estimates for a subset of categories:
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
s = 4
fig, ax = plt.subplots(4, 3, figsize=(14, 14))
ax = ax.ravel()
lines = []
labels = []
for i, c in enumerate(Temp.Categories[:12]):
y = Data.loc[Data.Category_Name==c, 'Price_Log'].values
x = Data.loc[Data.Category_Name==c, 'Shipping Fee'].values
NP_fun = lambda t: Temp.loc[Temp.Categories == c, 'mean'].values[0] + t*NP_Parm['beta'].mean()
ax[i].scatter(x + np.random.randn(len(x))*0.01, y, alpha=0.4)
t = np.linspace(-0.2, 1.2)
ax[i].plot(t, NP_fun(t), 'k', lw = 2, label = 'Complete Pooling')
ax[i].plot(t, CP_fun(t), 'r', lw = 2, label = 'No Pooling')
ax[i].set_xticks([0,1])
ax[i].set_title(c)
ax[i].set_ylim([0, 8])
ax[i].set_xlabel('Shipping Fee')
ax[i].set_ylabel('Price (Log)')
axLine, axLabel = ax[i].get_legend_handles_labels()
lines.extend(axLine)
labels.extend(axLabel)
del axLine, axLabel
fig.legend(lines[:2], labels[:2], bbox_to_anchor=(0.2, 1), loc='lower left',
ncol=2, title = 'Fitted Model', mode="expand", borderaxespad=0 , fontsize = 14)
plt.tight_layout()
There are Few approaches that we discuss here. For example,
The simplest possible partial pooling model for the retail price dataset is one that simply estimates prices, $\alpha$, with no other predictors and ignoring the effect of shipping, $\beta$.
In doing so, let, $$y_i = \alpha_{j[i]} + \epsilon_i$$
where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as partial_pooling:
# Priors
mu_alpha = Normal('mu_alpha', mu = 0 , sigma = 1e5)
sigma_alpha = HalfCauchy('sigma_alpha', 5)
# Random intercepts
alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
# Model error
sigma_y = HalfCauchy('sigma_y',5)
# Expected value
y_hat = alpha[Category]
# Data likelihood
y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
display(model_to_graphviz(partial_pooling))
with partial_pooling:
partial_pooling_trace = sample(1000, tune=1000)
# Partial Pooling Parmameters
PP_Parm = pd.DataFrame(partial_pooling_trace['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
Vars = ['sigma_y', 'alpha', 'sigma_alpha', 'mu_alpha']
Vars = sorted(Vars)
ax = az.plot_trace(partial_pooling_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if len(Vars)%2 ==1:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(partial_pooling_trace, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if (len(Vars)< len(ax)):
for i in np.arange(len(Vars), len(ax)):
ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
_ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 5])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
Standard Error $$SE = \frac{\sigma} {\sqrt{n}}$$
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Group = pd.DataFrame({'n': Data.groupby('Category_Name')['Train_ID'].count(),
'mean': Data.groupby('Category_Name')['Price_Log'].mean(),
'sd': Data.groupby('Category_Name')['Price_Log'].std()})
Group['se'] = Group['sd']/np.sqrt(Group['n'])
# deviations
jitter = np.random.normal(scale=0.1, size= Temp.shape[1])
fig, ax = plt.subplots(1, 2, figsize=(16,8))
_ = ax[0].scatter(Group['n'] + jitter, Group['mean'], color = 'SkyBlue', edgecolors = 'Navy')
for j, row in zip(jitter, Group.iterrows()):
n, mean, se = n, mean, se = row[1]['n'], row[1]['mean'], row[1]['se']
_ = ax[0].plot([n + j, n + j], [mean - se, mean + se], 'k-', alpha = 0.5)
_ = ax[0].set_xscale('log')
_ = ax[0].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[0].set_ylim([0,6])
_ = ax[0].set_xlim([0,1e3])
_ = ax[0].set_title('No Pool Model Estimates Mean')
_ = ax[1].scatter(Group['n'] + jitter, Temp.mean(axis=0), color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[1].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[1].set_xscale('log')
_ = ax[1].set_xlim([0,1e3])
_ = ax[1].set_ylim([0,6])
_ = ax[1].set_title('Partial Pool Model Estimates Mean')
for j, n , mean , std in zip(jitter, Group['n'].values, Temp.mean(axis=0), Temp.std(axis=0)):
_ = ax[1].plot([n+j, n+j], [mean- std, mean + std], 'k-', alpha = 0.5)
del j, n, mean, std, Temp, Group
The unpooled estimates are more imprecise than the partial pool estimates.
This model allows intercepts to vary across the category, according to a random effect.
$$y_i = \alpha_{j[i]} + \beta x_{i} + \epsilon_i$$where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as varying_intercept:
# Priors
mu_alpha = Normal('mu_alpha', mu = 0 , tau=1e-4)
sigma_alpha = HalfCauchy('sigma_alpha', 5)
# Random intercepts
alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
# Common slope
beta = Normal('beta', mu = 0 , sigma = 1e5)
# Model error
sigma_y = HalfCauchy('sigma_y',5)
# Expected value
y_hat = alpha[Category] + beta * x
# Data likelihood
y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
display(model_to_graphviz(varying_intercept))
with varying_intercept:
varying_intercept_trace = sample(1000, tune=1000)
# Varying Intercept Parmameters
VI_Parm = pd.DataFrame(varying_intercept_trace['alpha'],
columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
VI_Parm['beta'] = varying_intercept_trace['beta']
Vars = ['sigma_y', 'beta', 'alpha', 'sigma_alpha', 'mu_alpha']
Vars = sorted(Vars)
ax = az.plot_trace(varying_intercept_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if len(Vars)%2 ==1:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(varying_intercept_trace, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if (len(Vars)< len(ax)):
for i in np.arange(len(Vars), len(ax)):
ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
_ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VI_Parm.copy()
for alpha in Temp[df_search(Temp, 'alpha')].mean(axis=0):
ax.plot(t, alpha + Temp['beta'].mean()*t, color='DarkBlue', marker='o', linestyle='--',
linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as varying_intercept_pred:
# Priors
mu_alpha = Normal('mu_alpha', mu = 0 , tau=1e-4)
sigma_alpha = HalfCauchy('sigma_alpha', 5)
# Random intercepts
alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
# Common slope
beta = Normal('beta', mu = 0 , sigma = 1e5)
# Model error
sigma_y = HalfCauchy('sigma_y',5)
# Expected value
y_hat = alpha[Category] + beta * x
# Data likelihood
y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
# prediction
y_pred = Normal('y_pred', mu=alpha[Selected_Category] + beta, sigma=sigma_y)
display(model_to_graphviz(varying_intercept_pred))
with varying_intercept_pred:
varying_intercept_pred_trace = sample(1000, tune=1000)
_ = az.plot_trace(varying_intercept_pred_trace, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4))
_ = az.plot_posterior(varying_intercept_pred_trace, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
Temp0 = varying_intercept_pred_trace['y_pred']
_ = ax[0].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[0].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[0].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
'Category_Name'].values[0], fontsize = 16)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, len(Temp0)+1])
_ = ax[0].set_ylim([-1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
Temp0 = np.exp(Temp0)
_ = ax[1].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'MediumSeaGreen', edgecolors = 'Navy')
_ = ax[1].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[1].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
'Category_Name'].values[0], fontsize = 16)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, len(Temp0)+1])
_ = ax[1].set_ylim([0, 160])
_ = ax[1].set_ylabel('Estimated Price')
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
del Temp0
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
An alternative would be
$$y_i = \alpha + \beta_{j[i]} x_{i} + \epsilon_i$$where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2)\\ \alpha \sim N(0, \sigma^2)\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2) \end{cases} \end{align}
This model highlights the relationship between measured the logarithm of the price, the prevailing price level, and the effect of who pays the shipping fee at which the measurement was made.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as varying_slope:
# Priors
mu_beta = Normal('mu_beta', mu = 0 , tau=0.0001)
sigma_beta = HalfCauchy('sigma_beta', 5)
# Random intercepts
alpha = Normal('alpha', mu = 0 , sigma = 1e5)
# Common slope
beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)
# Model error
sigma_y = HalfCauchy('sigma_y',5)
# Expected value
y_hat = alpha + beta[Category] * x
# Data likelihood
y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
display(model_to_graphviz(varying_slope))
# Sampling
with varying_slope:
varying_slope_trace = sample(1000, tune=1000)
# Varying Slope Parmameters
VS_Parm = pd.DataFrame(varying_slope_trace['beta'],
columns = ['beta%i' %i for i in range(len(Data['Category'].unique()))])
VS_Parm['alpha'] = varying_slope_trace['alpha']
VS_Parm = VS_Parm.reindex(sorted(VS_Parm.columns), axis=1)
Vars = ['sigma_y', 'beta', 'alpha', 'sigma_beta', 'mu_beta']
Vars = sorted(Vars)
ax = az.plot_trace(varying_slope_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if len(Vars)%2 ==1:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(varying_slope_trace, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if (len(Vars)< len(ax)):
for i in np.arange(len(Vars), len(ax)):
ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
_ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-2, 2])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-3, 3])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VS_Parm.copy()
for beta in Temp[df_search(Temp, 'beta')].mean(axis=0):
ax.plot(t, Temp['alpha'].mean() + beta*t, color='DarkBlue', marker='o', linestyle='--',
linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as varying_slope_pred:
# Priors
mu_beta = Normal('mu_beta', mu = 0 , tau=0.0001)
sigma_beta = HalfCauchy('sigma_beta', 5)
# Random intercepts
alpha = Normal('alpha', mu = 0 , sigma = 1e5)
# Common slope
beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)
# Model error
sigma_y = HalfCauchy('sigma_y',5)
# Expected value
y_hat = alpha + beta[Category] * x
# Data likelihood
y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
# prediction
y_pred = Normal('y_pred', mu=alpha + beta[Selected_Category], sigma=sigma_y)
display(model_to_graphviz(varying_slope_pred))
with varying_slope_pred:
varying_slope_pred_trace = sample(1000, tune=1000)
_ = az.plot_trace(varying_slope_pred_trace, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4))
_ = az.plot_posterior(varying_slope_pred_trace, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
Temp0 = varying_slope_pred_trace['y_pred']
_ = ax[0].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[0].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[0].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
'Category_Name'].values[0], fontsize = 16)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, len(Temp0)+1])
_ = ax[0].set_ylim([-1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
Temp0 = np.exp(Temp0)
_ = ax[1].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'MediumSeaGreen', edgecolors = 'Navy')
_ = ax[1].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[1].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
'Category_Name'].values[0], fontsize = 16)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, len(Temp0)+1])
_ = ax[1].set_ylim([0, 160])
_ = ax[1].set_ylabel('Estimated Price')
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
del Temp0
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
The most general model allows both the intercept and slope to vary by category:
$$y_i = \alpha_{j[i]} + \beta_{j[i]} x_{i} + \epsilon_i$$where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2),\\ \alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2),\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2). \end{cases} \end{align}
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as varying_intercept_slope:
# Priors
mu_alpha = Normal('mu_alpha', mu = 0 , sigma = 1e5)
sigma_alpha = HalfCauchy('sigma_alpha', 5)
mu_beta = Normal('mu_beta', mu = 0 , sigma = 1e5)
sigma_beta = HalfCauchy('sigma_beta', 5)
# Random intercepts
alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
# Random slopes
beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)
# Model error
sigma_y = Uniform('sigma_y', lower=0, upper=100)
# Expected value
y_hat = alpha[Category] + beta[Category] * x
# Data likelihood
y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
display(model_to_graphviz(varying_intercept_slope))
# Sampling
with varying_intercept_slope:
varying_intercept_slope_trace = sample(1000, tune=1000)
# Varying Slope Parmameters
VSI_Parm = pd.DataFrame(varying_intercept_slope_trace['alpha'],
columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
VSI_Parm = pd.concat([VSI_Parm, pd.DataFrame(varying_intercept_slope_trace['beta'],
columns = ['beta%i' %i for i in range(len(Data['Category'].unique()))])], axis = 1)
Vars = ['sigma_y', 'beta', 'alpha', 'sigma_beta', 'mu_beta', 'sigma_alpha', 'mu_alpha']
Vars = sorted(Vars)
ax = az.plot_trace(varying_intercept_slope_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]
for i in range(len(ax)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if len(Vars)%2 ==1:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
_ = az.plot_posterior(varying_intercept_slope_trace, var_names=Vars, point_estimate = 'mean', ax = ax)
Temp = [LaTex[x] for x in Vars]
ax = ax.ravel()
for i in range(len(Vars)):
_ = ax[i].set_title(Temp[i], fontsize = 16)
if (len(Vars)< len(ax)):
for i in np.arange(len(Vars), len(ax)):
ax[i].set_axis_off()
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
_ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)
# Bottom
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))
# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
_ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-1, 1])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in the Effect of the Paid Shipping Fees by Category', fontsize = 18)
# Bottom
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50
_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-2, 2])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VSI_Parm.copy()
for a, b in zip(Temp[df_search(Temp, 'alpha')].mean(axis=0), Temp[df_search(Temp, 'beta')].mean(axis=0)):
ax.plot(t, a + b*t, color='DarkBlue', marker='o', linestyle='--', linewidth=1, markersize=4, alpha=0.4)
delta = 0.1
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
Temp = list_search(Category_df.iloc[:,0].to_list(), 'Styling Products')[0]
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']
with Model() as varying_intercept_slope_pred:
# Priors
mu_alpha = Normal('mu_alpha', mu = 0 , sigma = 1e5)
sigma_alpha = HalfCauchy('sigma_alpha', 5)
mu_beta = Normal('mu_beta', mu = 0 , sigma = 1e5)
sigma_beta = HalfCauchy('sigma_beta', 5)
# Random intercepts
alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
# Random slopes
beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)
# Model error
sigma_y = Uniform('sigma_y', lower=0, upper=100)
# Expected value
y_hat = alpha[Category] + beta[Category] * x
# Data likelihood
y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
# prediction
y_pred = Normal('y_pred', mu=alpha[Selected_Category] + beta[Selected_Category], sigma=sigma_y)
display(model_to_graphviz(varying_intercept_slope_pred))
with varying_intercept_slope_pred:
varying_intercept_slope_pred_trace = sample(1000, tune=1000)
_ = az.plot_trace(varying_intercept_slope_pred_trace, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4))
_ = az.plot_posterior(varying_intercept_slope_pred_trace, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()
Temp0 = varying_intercept_slope_pred_trace['y_pred']
_ = ax[0].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[0].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[0].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
'Category_Name'].values[0], fontsize = 16)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, len(Temp0)+1])
_ = ax[0].set_ylim([-1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
Temp0 = np.exp(Temp0)
_ = ax[1].scatter(x= np.arange(len(Temp0)), y= Temp0, color = 'MediumSeaGreen', edgecolors = 'Navy')
_ = ax[1].hlines(Temp0.mean(), -1, len(Temp0)+1, linestyles='-',
color = 'OrangeRed', lw = 3, label = 'mean = %.2f' % Temp0.mean())
_ = ax[1].set_title('Predictions: %s' % Category_df.loc[Category_df.Category == Selected_Category,
'Category_Name'].values[0], fontsize = 16)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, len(Temp0)+1])
_ = ax[1].set_ylim([0, 160])
_ = ax[1].set_ylabel('Estimated Price')
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
del Temp0
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:')
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)